{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from utils import tokenize, load_curpus\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 加载全部不同主题的语料"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/dy/xjy0y7v97js5x1bghby2fnkm0000gn/T/jieba.cache\n",
      "Loading model cost 0.667 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import glob\n",
    "data = {}\n",
    "for f in glob.glob(\"weibo2018/topics/*.txt\"):\n",
    "    topic = os.path.split(f)[-1].split(\".\")[0]\n",
    "    data[topic] = load_curpus(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "加载停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "stopwords = []\n",
    "with open(\"stopwords.txt\", \"r\", encoding=\"utf8\") as f:\n",
    "    for w in f:\n",
    "        stopwords.append(w.strip())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "加载之前训练好的FastText模型\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gensim.models import FastText\n",
    "model = FastText.load(\"model/model_100.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 为保证输入神经网络的向量长度一致, 要对长度不足max_length的句子用零向量补齐, 对长度超过max_length的句子进行截断"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "max_length = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dengxiuqi/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n",
      "  \n",
      "/Users/dengxiuqi/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "data_X, data_length = {}, {}\n",
    "\n",
    "for topic, corpus in data.items():\n",
    "    _data_X, _data_length = [], []\n",
    "    for content, sentiment in corpus:\n",
    "        X = []\n",
    "        for w in content[:max_length]:\n",
    "            if w in model:\n",
    "                X.append(np.expand_dims(model[w], 0))\n",
    "        if X:\n",
    "            length = len(X)\n",
    "            X = X + [np.zeros_like(X[0])] * (max_length - length)\n",
    "            X = np.concatenate(X)\n",
    "            X = np.expand_dims(X, 0)\n",
    "            _data_X.append(X)\n",
    "            _data_length.append(length)\n",
    "    data_X[topic] = _data_X\n",
    "    data_length[topic] = _data_length"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Attention+LSTM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dengxiuqi/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /Users/dengxiuqi/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:430: calling reverse_sequence (from tensorflow.python.ops.array_ops) with seq_dim is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "seq_dim is deprecated, use seq_axis instead\n",
      "WARNING:tensorflow:From /Users/dengxiuqi/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:454: calling reverse_sequence (from tensorflow.python.ops.array_ops) with batch_dim is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "batch_dim is deprecated, use batch_axis instead\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow.contrib import rnn, seq2seq\n",
    "batch_size = 100\n",
    "lr = 1e-3\n",
    "hidden_size = 100\n",
    "\n",
    "X = tf.placeholder(shape=(batch_size, max_length, 100), dtype=tf.float32, name=\"X\")\n",
    "L = tf.placeholder(shape=(batch_size), dtype=np.int32, name=\"L\")\n",
    "y = tf.placeholder(shape=(batch_size, 1), dtype=np.float32, name=\"y\")\n",
    "dropout = tf.placeholder(shape=(), dtype=np.float32, name=\"dropout\")\n",
    "with tf.variable_scope(\"lstm\", reuse=tf.AUTO_REUSE):\n",
    "    def lstm_cell(hidden_size, cell_id=0):\n",
    "        # LSTM细胞生成器\n",
    "        cell = rnn.LSTMCell(hidden_size, reuse=tf.AUTO_REUSE, name='cell%d' % cell_id)\n",
    "        cell = rnn.DropoutWrapper(cell, output_keep_prob=dropout)\n",
    "        return cell\n",
    "    \n",
    "    context = tf.get_variable(\"context\", shape=(1, hidden_size))\n",
    "    context = tf.tile(context, [batch_size, 1])\n",
    "    fw_cell = lstm_cell(hidden_size, 0)\n",
    "    bw_cell = lstm_cell(hidden_size, 1)\n",
    "    fw_zero = fw_cell.zero_state(batch_size, tf.float32)\n",
    "    bw_zero = fw_cell.zero_state(batch_size, tf.float32)\n",
    "    encoder_output, encoder_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,\n",
    "                                                         cell_bw=bw_cell,\n",
    "                                                         inputs=X,\n",
    "                                                         sequence_length=L,\n",
    "                                                         initial_state_fw=fw_zero,\n",
    "                                                         initial_state_bw=bw_zero,\n",
    "                                                         dtype=tf.float32)\n",
    "    attention_context = tf.concat(encoder_output, axis=2)\n",
    "    attention_mech = seq2seq.BahdanauAttention(hidden_size * 2,\n",
    "                                                 memory=attention_context,\n",
    "                                                 memory_sequence_length=L,\n",
    "                                                 name=\"AttentionMechanism\")\n",
    "    attention_cell = seq2seq.AttentionWrapper(cell=lstm_cell(hidden_size, 2),\n",
    "                                                attention_mechanism=attention_mech,\n",
    "                                                attention_layer_size=hidden_size,\n",
    "                                                alignment_history=True,\n",
    "                                                output_attention=True,\n",
    "                                                name=\"AttentionCell\")\n",
    "    attention_zero = attention_cell.zero_state(batch_size, tf.float32)\n",
    "    attention_output, attention_state = attention_cell.call(context, attention_zero)\n",
    "    aligments = attention_state[3]\n",
    "    \n",
    "    W1 = tf.get_variable(\"W1\", shape=(hidden_size, 50))\n",
    "    b1 = tf.get_variable(\"b1\", shape=(50,))\n",
    "    W2 = tf.get_variable(\"W2\", shape=(50, 1))\n",
    "    b2 = tf.get_variable(\"b2\", shape=(1,))\n",
    "    fcn1 = tf.nn.xw_plus_b(attention_output, W1, b1)\n",
    "    logists = tf.nn.xw_plus_b(fcn1, W2, b2)\n",
    "    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logists, labels=y))\n",
    "    op = tf.train.AdamOptimizer(lr).minimize(loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)\n",
    "config = tf.ConfigProto(gpu_options=gpu_options)\n",
    "sess = tf.Session(config=config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 加载模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Restoring parameters from model/attention/model-1000\n"
     ]
    }
   ],
   "source": [
    "sess.run(tf.global_variables_initializer())\n",
    "saver = tf.train.Saver()\n",
    "checkPoint = tf.train.get_checkpoint_state(\"model/attention\")\n",
    "saver.restore(sess, checkPoint.model_checkpoint_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 对不同主题的文本进行情感分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentiment = {}\n",
    "for topic in data_X.keys():\n",
    "    _X = np.concatenate(data_X[topic] + [np.zeros_like(data_X[topic][0])] * (batch_size - len(data_X[topic])))\n",
    "    _L = np.array(data_length[topic] + [1] * (batch_size - len(data_length[topic])))\n",
    "    result = sess.run(tf.nn.sigmoid(logists), feed_dict={X: _X, L: _L, dropout:1.})\n",
    "    prediction = []\n",
    "    for i in result[:len(data_X[topic])]:\n",
    "        if i > 0.5:\n",
    "            prediction.append(1)\n",
    "        else:\n",
    "            prediction.append(0)\n",
    "    sentiment[topic] = prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "主题为【同济大学】的微博中, 正面:88, 负面:12\n",
      "主题为【周杰伦】的微博中, 正面:88, 负面:12\n",
      "主题为【好莱坞】的微博中, 正面:79, 负面:21\n",
      "主题为【人工智能】的微博中, 正面:79, 负面:21\n",
      "主题为【特朗普】的微博中, 正面:55, 负面:44\n",
      "主题为【毕业】的微博中, 正面:78, 负面:22\n"
     ]
    }
   ],
   "source": [
    "for topic, res in sentiment.items():\n",
    "    print(\"主题为【%s】的微博中, 正面:%d, 负面:%d\" % (topic, res.count(1), res.count(0)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
